## session info: ----
# R version 4.3.1 (2023-06-16)
# Platform: aarch64-apple-darwin20 (64-bit)
# Running under: macOS Ventura 13.5.2
# 
# locale:
# [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
# 
# attached base packages:
# [1] stats     graphics  grDevices utils     datasets  methods   base     
# 
# other attached packages:
# [1] collapse_2.0.3      SortedEffects_1.7.0 dotwhisker_0.7.4    haven_2.5.3         lubridate_1.9.2     forcats_1.0.0      
# [7] stringr_1.5.0       dplyr_1.1.3         purrr_1.0.2         readr_2.1.4         tidyr_1.3.0         tibble_3.2.1       
# [13] ggplot2_3.4.3       tidyverse_2.0.0    
# 
# loaded via a namespace (and not attached):
# [1] gtable_0.3.4       bayestestR_0.13.1  xfun_0.40          htmlwidgets_1.6.2  insight_0.19.5     lattice_0.21-8    
# [7] tzdb_0.4.0         vctrs_0.6.3        tools_4.3.1        generics_0.1.3     datawizard_0.9.0   parallel_4.3.1    
# [13] fansi_1.0.4        cluster_2.1.4      pkgconfig_2.0.3    Matrix_1.6-1       data.table_1.14.8  checkmate_2.2.0   
# [19] readxl_1.4.3       lifecycle_1.0.3    compiler_4.3.1     MatrixModels_0.5-2 munsell_0.5.0      ggstance_0.3.6    
# [25] janitor_2.2.0      snakecase_0.11.1   SparseM_1.81       quantreg_5.97      htmltools_0.5.6    htmlTable_2.4.1   
# [31] Formula_1.2-5      pillar_1.9.0       MASS_7.3-60        Hmisc_5.1-0        rpart_4.1.19       boot_1.3-28.1     
# [37] tidyselect_1.2.0   digest_0.6.33      stringi_1.7.12     splines_4.3.1      fastmap_1.1.1      grid_4.3.1        
# [43] colorspace_2.1-0   cli_3.6.1          magrittr_2.0.3     fastDummies_1.7.3  base64enc_0.1-3    survival_3.5-5    
# [49] utf8_1.2.3         withr_2.5.0        foreign_0.8-84     scales_1.2.1       backports_1.4.1    timechange_0.2.0  
# [55] rmarkdown_2.25     nnet_7.3-19        gridExtra_2.3      cellranger_1.1.0   hms_1.1.3          pbapply_1.7-2     
# [61] evaluate_0.21      knitr_1.43         parameters_0.21.1  rlang_1.1.1        Rcpp_1.0.11        xtable_1.8-4      
# [67] glue_1.6.2         rstudioapi_0.15.0  R6_2.5.1      

## load libraries: ----

#install.packages("tidyverse")
#install.packages("haven")
#install.packages("SortedEffects")
#install.packages("collapse")
#install.packages("fastDummies")

#' Remove the corresponding `#` above (and run that line) if you
#' need to install one of the required packages.

## load dataset: ----
# Read the Stata (.dta) file into `SGenderMetaD`. The absolute path points to
# the original author's machine; adjust it before running elsewhere.
SGenderMetaD <- haven::read_dta(
  file = "/Users/roussille/Dropbox/askgap/replication/Data/SGenderMetaD.dta"
)

## preparing data: ----
# Columns kept when collapsing to one row per `sid` (same order as the
# left-hand side of the aggregation formula they are turned into below).
collapse_vars <- c(
  "logs_salary", "female", "s_total_exp", "s_total_exp2",
  "days_unemployed", "s_grad_year",
  "s_primary_field_exp", "s_choice_location", "s_current_location",
  "s_primary_field", "cat_degree", "ranking", "csdegree", "ivyplus",
  "s_contract",
  "s_search_status", "s_sponsorship", "s_remote", "employed", "faang",
  "s_nb_reports", "highest_jobtitle", "linkedin", "website", "smonthyear",
  "nbpastbatch", "batch_length",
  "html", "java", "python", "javascript", "ios", "pointnet", "android",
  "sql", "c", "ruby", "dataanalysis", "php", "nodejs", "css", "react",
  "go", "r", "saas", "linux", "agile", "angular", "swift", "hadoop", "scala",
  # `wishind_*` columns:
  "wishind_ag", "wishind_analytics", "wishind_auto", "wishind_av",
  "wishind_bank", "wishind_biotech", "wishind_game", "wishind_cleantech",
  "wishind_clothes", "wishind_cybersecurity", "wishind_dating", "wishind_com",
  "wishind_pay", "wishind_storage", "wishind_ecommerce", "wishind_education",
  "wishind_energy", "wishind_entpsoft", "wishind_food", "wishind_gov",
  "wishind_hardware", "wishind_health", "wishind_hotel", "wishind_hr",
  "wishind_supplychain", "wishind_infosys", "wishind_insurance",
  "wishind_legal", "wishind_news", "wishind_nonprofit", "wishind_oilgas",
  "wishind_wellness", "wishind_perssafe", "wishind_pubsafe",
  "wishind_realestate", "wishind_research", "wishind_retail",
  "wishind_robotics", "wishind_socialnet", "wishind_sports", "wishind_infra",
  "wishind_logistics", "wishind_art", "wishind_video",
  # `wishskill_*` columns:
  "wishskill_python", "wishskill_java", "wishskill_javascript",
  "wishskill_productmanagement", "wishskill_react", "wishskill_html",
  "wishskill_ml", "wishskill_sql", "wishskill_cplus", "wishskill_dataanalysis",
  "wishskill_leadership", "wishskill_css", "wishskill_node",
  "wishskill_productdevelopment", "wishskill_csharp", "wishskill_go",
  "wishskill_adobesuite", "wishskill_aws", "wishskill_management",
  "wishskill_design", "wishskill_r", "wishskill_busdev",
  "wishskill_typescript", "wishskill_swift", "wishskill_php", "wishskill_ruby",
  "wishskill_office", "wishskill_kubernetes", "wishskill_c", "wishskill_ios",
  # `csize*`, `cpath_*` and `cgoal_*` columns:
  "csize1_15", "csize16_50", "csize51_200", "csize201_500", "csize500_",
  "cpath_ind", "cpath_manager",
  "cgoal_culture", "cgoal_mentor", "cgoal_tech", "cgoal_lead", "cgoal_large",
  "cgoal_lsoc"
)

# Two-sided formula `<var1> + <var2> + ... ~ sid` understood by collap().
collapse_fm <- stats::as.formula(
  paste(paste(collapse_vars, collapse = " + "), "~ sid")
)

# Categorical columns expanded into indicators and then renamed from
# `<name>_<level>` to `<name>.<level>`.
indicator_vars <- c(
  "female",
  "s_primary_field",
  "s_primary_field_exp",
  "s_current_location",
  "s_choice_location",
  "cat_degree",
  "csdegree",
  "s_contract",
  "s_search_status",
  "s_nb_reports",
  "s_remote",
  "nbpastbatch",
  "batch_length",
  "highest_jobtitle",
  "ranking"
)

SGenderMetaD <- SGenderMetaD |>
  # time restriction of observations:
  dplyr::filter(smonthyear < 705) |>
  # collapse at the sid level: rows are ordered by batch_start_date within sid
  # and ffirst() is applied per sid.
  # NOTE(review): collap() applies FUN to numeric columns only; non-numeric
  # columns would fall back to its default catFUN -- confirm all listed columns
  # are numeric.
  dplyr::arrange(sid, batch_start_date) |>
  collapse::collap(by = collapse_fm, FUN = collapse::ffirst) |>
  # creating indicator variables (the time indicator `smonthyear` goes last):
  fastDummies::dummy_cols(
    select_columns = c(indicator_vars, "smonthyear")
  ) |>
  # renaming indicator variables: a trailing `_<digits>` becomes `.<digits>`
  dplyr::rename_with(
    \(nm) stringr::str_replace(nm, pattern = "_(\\d+)$", replacement = ".\\1"),
    dplyr::starts_with(indicator_vars)
  ) |>
  # time indicators: drop the first underscore (`smonthyear_<n>` ->
  # `smonthyear<n>`)
  dplyr::rename_with(
    \(nm) stringr::str_replace(nm, pattern = "_", replacement = ""),
    dplyr::starts_with("smonthyear")
  ) |>
  # readable names for the three `female` indicators:
  dplyr::rename(
    female.male = female.0,
    female.female = female.1,
    female.unknown = female.2
  )

## selecting needed variables' names for the following exercises: ----
# Name prefixes of the columns to keep. Those ending in "." match the
# indicator columns `<name>.<level>`; "s_total_exp" also matches
# "s_total_exp2".
covariate_prefixes <- c(
  "s_primary_field_exp.",
  "s_total_exp",
  "s_choice_location.",
  "s_current_location.",
  "s_primary_field.",
  "cat_degree.",
  "s_contract.",
  "csdegree.",
  "s_search_status.",
  "s_remote.",
  "s_nb_reports.",
  "highest_jobtitle.",
  "batch_length.",
  "nbpastbatch.",
  "ranking.",
  "wishind_",
  "wishskill_",
  "csize",
  "cpath_",
  "cgoal_"
)

# Columns kept by exact name, in the order they should appear.
covariate_columns <- c(
  "ivyplus", "s_sponsorship", "employed", "days_unemployed", "faang",
  "html", "java", "python", "javascript", "ios", "pointnet", "android",
  "sql", "c", "ruby", "dataanalysis", "php", "nodejs", "css", "react",
  "go", "r", "saas", "linux", "agile", "angular", "swift", "hadoop", "scala",
  "s_grad_year", "linkedin", "website"
)

# Indicators dropped in addition to every column ending in ".0".
# NOTE(review): presumably the omitted (reference) categories -- confirm.
omitted_indicators <- c(
  "ranking.1",
  "s_choice_location.1",
  "s_current_location.1",
  "s_primary_field.1",
  "batch_length.15"
)

sel_var_names <- SGenderMetaD |>
  dplyr::select(
    dplyr::starts_with(covariate_prefixes),
    dplyr::all_of(covariate_columns)
  ) |>
  dplyr::select(
    -dplyr::ends_with(".0"),
    -dplyr::all_of(omitted_indicators)
  ) |>
  names()

## preparing needed formula: ----
# formula part that goes inside parenthesis: "(v1+v2+...)" built from the
# selected covariate names
in_parenthesis <- paste0("(", paste0(sel_var_names, collapse = "+"), ")")

# formula part that goes outside parenthesis: every `smonthyear*` column
# except `smonthyear` itself and `smonthyear672`, joined by "+"
month_indicator_names <- SGenderMetaD |>
  dplyr::select(
    dplyr::starts_with("smonthyear"),
    -smonthyear,
    -smonthyear672
  ) |>
  names()

out_parenthesis <- paste0(month_indicator_names, collapse = "+")

# formula: covariates, their interactions with `female.female` and
# `female.unknown`, plus the month indicators
fm <- formula(
  paste0(
    "logs_salary ~", in_parenthesis,
    "+ female.female *", in_parenthesis,
    "+ female.unknown *", in_parenthesis,
    "+", out_parenthesis
  )
)

## SPE: ----
# Sorted partial effects of `female.female` under the model `fm`, fitted by
# OLS, evaluated at `us` = 0.05, 0.06, ..., 0.95 with b = 500, alpha = 0.05 and
# bc = TRUE (see ?SortedEffects::spe for the meaning of each argument).
gap <- SortedEffects::spe(
  fm = fm, data = SGenderMetaD, var = "female.female",
  # `[[` extracts the column as a plain vector, so `subgroup` is a logical
  # vector. `[, "female"]` on a tibble returns a one-column tibble and the
  # comparison then yields an n x 1 logical matrix instead.
  subgroup = SGenderMetaD[["female"]] == 1, method = "ols",
  alpha = 0.05, us = (5:95) / 100, b = 500, bc = TRUE
)

# plot the SPE:
# Single definition of the figure so both draws below are guaranteed identical.
plot_gap_spe <- function() {
  plot(x = gap, sub = "OLS model", family = "Times",
       xlab = "Percentile index", ylab = "Gender Ask gap",
       ylim = c(-0.1, 0.1))
}

pdf("/Users/roussille/Dropbox/paygap/Work/replication/askgap/figureC1.pdf", family = "Times")
# `finally` closes the PDF device even if plotting fails, so an error does not
# leave a dangling open device / truncated file behind.
invisible(tryCatch(
  {
    plot_gap_spe()
    grid(nx = NULL, ny = NULL,
         lty = 3,           # Grid line type
         col = "lightgray", # Grid line color
         lwd = 1)
    # Draw the same plot again on top of the existing page (no new page);
    # presumably so the grid lines end up beneath the estimates.
    par(new = TRUE)
    plot_gap_spe()
  },
  finally = dev.off()
))

## classification analysis: ----
# Classification analysis for `female.female` under the model `fm`, reporting
# the variables in `sel_var_names` (`t`) with cl = "both", b = 500, bc = FALSE
# and u = 0.1 (see ?SortedEffects::ca for the meaning of each argument).
Char <- SortedEffects::ca(
  fm = fm, data = SGenderMetaD, var = "female.female",
  t = sel_var_names, cl = "both", b = 500,
  # `[[` extracts the column as a plain vector, so `subgroup` is a logical
  # vector. `[, "female"]` on a tibble returns a one-column tibble and the
  # comparison then yields an n x 1 logical matrix instead.
  subgroup = SGenderMetaD[["female"]] == 1,
  bc = FALSE, u = 0.1
)

# Summary table of the classification analysis, used for the exported table.
Chartable <- summary(Char)

# turning into df
Chartable_df <- as.data.frame(Chartable)

# changing repeated names: columns 2 and 4 get distinct names (set by
# position) so they can be referred to unambiguously afterwards
names(Chartable_df)[c(2, 4)] <- c("SE_least", "SE_most")

# preparing table to export
# Display labels for the rows of the exported table, keyed by the row name
# used in `Chartable_df`. This single vector drives which rows are kept, the
# order in which they appear, and how they are labelled.
table_row_labels <- c(
  "s_total_exp" = "Total years of experience",
  "s_primary_field_exp.1" = "Position experience = 2-4 years",
  "s_primary_field_exp.2" = "Position experience = 4-6 years",
  "s_primary_field_exp.3" = "Position experience = 6-10 years",
  "s_primary_field_exp.4" = "Position experience = 10-15 years",
  "s_primary_field_exp.5" = "Position experience = 15+ years",
  "employed" = "Employed",
  "days_unemployed" = "Days unemployed",
  "ivyplus" = "Ivy League Plus",
  "csdegree.1" = "CS degree",
  "java" = "Java",
  "html" = "HTML",
  "python" = "Python",
  "javascript" = "JavaScript",
  "sql" = "SQL",
  "dataanalysis" = "Data analysis",
  "pointnet" = "Pointnet",
  "c" = "C",
  "nodejs" = "Node JS",
  "css" = "CSS",
  "react" = "React"
)

Chartable_df |>
  tibble::rownames_to_column() |>
  # keep only the rows that have a label ...
  dplyr::filter(rowname %in% names(table_row_labels)) |>
  # ... in the order in which the labels are listed ...
  dplyr::arrange(match(rowname, names(table_row_labels))) |>
  # ... and replace the variable name by its label (names dropped so the
  # column is a plain character vector).
  dplyr::mutate(rowname = unname(table_row_labels[rowname])) |>
  # NOTE(review): the headers say "5%" while the ca() call earlier in this
  # script uses u = 0.1 -- confirm which share is intended.
  dplyr::rename("5% Smallest ask gap" = Most,
                "5% Highest ask gap" = Least,
                " " = rowname,
                "SE (Small)" = SE_least,
                "SE (High)" = SE_most) |>
  xtable::xtable() |>
  # printing table: `type` is an argument of print.xtable(), not of xtable()
  # (where it was silently ignored), so it is passed here.
  print(type = "latex",
        include.rownames = FALSE,
        file = "/Users/roussille/Dropbox/askgap/replication/Tables/tableC1.tex")
